{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 下载数据集\n",
    "\n",
    "数据集来自 http://www.ee.surrey.ac.uk/CVSSP/demos/chars74k/\n",
    "\n",
    "我们下载的是**EnglishFnt.tgz**，是印刷体数字加大小写字母。\n",
    "\n",
    "* [tqdm](https://github.com/tqdm/tqdm) 是一个进度条的库。\n",
    "* [requests](http://docs.python-requests.org/en/master/) 是一个对人类友好的 HTTP 库。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "48651KB [01:38, 492.01KB/s]                           \n"
     ]
    }
   ],
   "source": [
    "import requests\n",
    "from tqdm import tqdm\n",
    "import os\n",
    "\n",
    "fileurl = 'http://www.ee.surrey.ac.uk/CVSSP/demos/chars74k/EnglishFnt.tgz'\n",
    "filename = 'EnglishFnt.tgz'\n",
    "if not os.path.exists(filename):\n",
    "    # Download under a temporary name and rename only after the transfer\n",
    "    # completes, so an interrupted run never leaves a truncated archive that\n",
    "    # the os.path.exists() check above would accept next time.\n",
    "    partname = filename + '.part'\n",
    "    r = requests.get(fileurl, stream=True)\n",
    "    r.raise_for_status()  # stop on 4xx/5xx instead of saving an error page as the archive\n",
    "    # Content-Length is optional; without it tqdm only shows a running count.\n",
    "    length = r.headers.get('Content-Length')\n",
    "    total = int(length) // 1024 if length else None\n",
    "    with open(partname, 'wb') as f:\n",
    "        for chunk in tqdm(r.iter_content(1024), unit='KB', total=total):\n",
    "            f.write(chunk)\n",
    "    os.rename(partname, filename)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 解压数据集\n",
    "\n",
    "* [tarfile](https://docs.python.org/2/library/tarfile.html) 是 Python 自带的操作 tar 文件的库。\n",
    "* [shutil](https://docs.python.org/2/library/shutil.html) 是 Python 自带的高级文件操作库。\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "loading\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 63055/63055 [00:11<00:00, 5350.48it/s]\n"
     ]
    }
   ],
   "source": [
    "import tarfile\n",
    "import shutil\n",
    "\n",
    "def mkdir(path):\n",
    "    \"\"\"Create `path` (including missing parent directories) unless it already exists.\"\"\"\n",
    "    if os.path.exists(path):\n",
    "        return\n",
    "    os.makedirs(path)\n",
    "\n",
    "def rmdir(path):\n",
    "    \"\"\"Remove the directory tree at `path`; do nothing when `path` does not exist.\"\"\"\n",
    "    if not os.path.exists(path):\n",
    "        return\n",
    "    shutil.rmtree(path)\n",
    "\n",
    "# Unpack the archive into the current directory, one member at a time so\n",
    "# tqdm can show progress.\n",
    "with tarfile.open(filename, 'r') as tfile:\n",
    "    print('loading')  # parenthesised form prints the same text on Python 2 and 3\n",
    "    members = tfile.getmembers()\n",
    "    # Trailing separator so that '/data' does not count as a prefix of '/database'.\n",
    "    root_prefix = os.path.join(os.path.abspath('.'), '')\n",
    "    for member in tqdm(members):\n",
    "        # The archive was fetched over plain HTTP: refuse entries such as\n",
    "        # '../x' or '/etc/x' that would be written outside this directory.\n",
    "        target = os.path.abspath(member.name)\n",
    "        if not (target + os.sep).startswith(root_prefix):\n",
    "            raise ValueError('unsafe path in archive: %s' % member.name)\n",
    "        if member.isdir():\n",
    "            mkdir(member.name)\n",
    "            continue\n",
    "        src = tfile.extractfile(member)\n",
    "        if src is None:\n",
    "            continue  # extractfile() returns None for members that carry no file data\n",
    "        # Do not rely on the archive listing a directory before its files.\n",
    "        parent = os.path.dirname(member.name)\n",
    "        if parent:\n",
    "            mkdir(parent)\n",
    "        with open(member.name, 'wb') as f:\n",
    "            shutil.copyfileobj(src, f)  # stream instead of loading the whole member into memory"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 分类数据集\n",
    "\n",
    "数据集有数字和大小写字母，但是我们只需要0123456789和非数字。\n",
    "\n",
    "因此将 A~Z,a~z 的图片移到 A 的文件夹，再将其他空文件夹删除。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 51/51 [00:03<00:00, 12.79it/s]\n"
     ]
    }
   ],
   "source": [
    "# Move every image from Sample012..Sample062 into Sample011 so that all\n",
    "# letters share a single 'not a digit' folder, then drop the emptied folders.\n",
    "notnumdir = 'English/Fnt/Sample011/'\n",
    "for i in tqdm(range(12, 63)):\n",
    "    path = 'English/Fnt/Sample%03d/' % i\n",
    "    if not os.path.isdir(path):\n",
    "        continue  # already merged by an earlier run of this cell\n",
    "    # Named imgname, not filename: `filename` is the archive name that the\n",
    "    # download and extract cells rely on, and must not be overwritten here.\n",
    "    for imgname in os.listdir(path):\n",
    "        os.rename(path + imgname, notnumdir + imgname)\n",
    "    os.rmdir(path)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 预处理数据集\n",
    "\n",
    "我们需要将这里的图片裁切为28*28，以便于输入到神经网络中。\n",
    "\n",
    "## 首先测试一张图"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import cv2\n",
    "import numpy as np\n",
    "\n",
    "def resize(rawimg, size=28):\n",
    "    \"\"\"Fit a 2-D grayscale image onto a white size x size canvas, centred.\n",
    "\n",
    "    The image is scaled uniformly so that its longer side becomes `size`\n",
    "    pixels, then pasted in the middle of a canvas filled with 255.\n",
    "\n",
    "    rawimg: 2-D uint8 array (rows, cols).\n",
    "    size:   side length of the square output; defaults to 28.\n",
    "    Returns a (size, size) uint8 array.\n",
    "    \"\"\"\n",
    "    src_h, src_w = rawimg.shape[:2]\n",
    "    scale = float(size) / max(src_h, src_w)\n",
    "    # Clamp to [1, size]: a very thin crop would otherwise round to a zero\n",
    "    # width or height, which cv2.resize rejects.\n",
    "    w = min(size, max(1, int(src_w * scale + 0.5)))\n",
    "    h = min(size, max(1, int(src_h * scale + 0.5)))\n",
    "    img = cv2.resize(rawimg, (w, h), interpolation=cv2.INTER_CUBIC)\n",
    "    outimg = np.full((size, size), 255, dtype=np.uint8)\n",
    "    # Floor division keeps the offsets integral under true division as well.\n",
    "    x = (size - w) // 2\n",
    "    y = (size - h) // 2\n",
    "    outimg[y:y+h, x:x+w] = img\n",
    "    return outimg\n",
    "\n",
    "def convert(imgpath):\n",
    "    \"\"\"Load an image, crop it to a detected contour and return resize() of the crop.\n",
    "\n",
    "    imgpath: path of the image file to read.\n",
    "    Returns the array produced by resize() for the cropped grayscale region.\n",
    "    Raises ValueError when the file cannot be read or no contour is found.\n",
    "    \"\"\"\n",
    "    gray = cv2.imread(imgpath, cv2.IMREAD_GRAYSCALE)\n",
    "    if gray is None:\n",
    "        # cv2.imread reports failure by returning None instead of raising.\n",
    "        raise ValueError('cannot read image: %s' % imgpath)\n",
    "    bw = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY_INV, 25, 25)\n",
    "    # findContours returns (contours, hierarchy) on OpenCV 2.x/4.x and\n",
    "    # (image, contours, hierarchy) on 3.x; [-2] is the contour list on all of them.\n",
    "    ctrs = cv2.findContours(bw.copy(), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)[-2]\n",
    "    if len(ctrs) == 0:\n",
    "        raise ValueError('no contour found in image: %s' % imgpath)\n",
    "    # NOTE(review): only the last contour is used, so a glyph made of several\n",
    "    # separate parts is presumably cropped to one of them - confirm this is intended.\n",
    "    x, y, w, h = cv2.boundingRect(ctrs[-1])\n",
    "    roi = gray[y:y+h, x:x+w]\n",
    "    return resize(roi)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<matplotlib.image.AxesImage at 0x1144f1d10>"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAgQAAAEBCAYAAAAHJ724AAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAAPYQAAD2EBqD+naQAAIABJREFUeJzt3X90XHWd//HnOz+m+dWkaUtboMD2B8WqK5AgWBV0rbsF\n/Fr1688ox6Me9+svPHzz3f2i7LpfWTnruuyRsijssrrrqkA8SL9+gbPYakVZQEqPtIBAASutBUqS\ntvmdTJLJzPv7x50Jk2l+NnPnTpLX45x7krn3M5lXJjNz3/ncz/1cc3dERERkYSuJOoCIiIhETwWB\niIiIqCAQERERFQQiIiKCCgIRERFBBYGIiIiggkBERERQQSAiIiKoIBARERFUEIiIiAgRFwRm9gUz\nO2hmcTPbbWZvjDKPiIjIQhVZQWBmHwa+CXwVOB94AthpZsujyiQiIrJQWVQXNzKz3cCj7n5V+rYB\nLwI3ufv1kYQSERFZoCLpITCzcqAR+EVmnQeVyS5gUxSZRCR8OkwoUrzKInrc5UAp0Jazvg04J7ex\nmS0DtgCHgMGww4nMYxXAHwE73f14IR846zDh/wD2AM0Ehwk3uPuxcdrrfS+SH9N630dVEEzEgPGO\nYWwBbi9wFpH57GPAHQV+zGbgVnf/AYCZfRZ4F/ApYLzDhHrfi+TXpO/7qAqCY0ASWJmzfgUn9hpA\n8B8CtbW1nHfeeWM2bNmyhUsvvTSEiONrbm5m27ZtBXs8ZVCGk82wY8cOdu7cOWZdb28v+/btg/R7\nqlCyDhN+PbPO3d3MJjtMeAjgtttuY+PGjUXxXI9ntrlGRkZIJBKjX7OXoaEh2tvbaWtro62tjfb2\n9tHb7e3tDA6q4yRbTU0Np59++gnL6tWrOf300yktLR33fvP1tZWxf/9+rrjiCpjifR9JQeDuCTN7\nDNgM3AOjgwo3AzeNc5dBgPPOO48HHnigYDnHU1dXR0NDgzIoQ9FnaGho4K/+6q/GrNu7dy+NjY1Q\n+C74GR0mTBsE2LhxIw0NDUXxXI9ntrkSiQTDw8OjX7OXeDzO4sWLWbRoEWZGMpkkHo/T3d1NSYmm\nkclVWlpKVVUV9fX1rFy5kjPPPJO1a9eOLhMVBPP1tTWOSd/3UR4yuAH4frowyBxPrAL+I8JMIlJY\nEx0mXDBSqRQjIyMMDQ0Rj8cZHBxkcHCQeDxOf38/XV1d9PT0MDAwwODgIMPDwySTSaI6Q0zmr8gK\nAne/Mz3nwNcIDh08Dmxx96NRZRKR0Mz0MOGo5uZm6urq2LNnD1u3bgWgqamJpqamUIIWWjKZZHh4\nmP7+fvr6+sYsvb29tLe3c+zYMTo7O+nt7SUej5NIJFQQyLhaWlpoaWkZs667u3ta9410UKG73wLc\nEmUGEQnfSRwmHLVt2zYaGhrYunUr99xzT/hhCyyZTDI0NMTAwAA9PT10d3fT1dVFd3c3nZ2ddHR0\n0NHRQVdXF319faO9BKlUKuroUoTGK5azDhVOqtjOMpjUli1boo5QFP+VKIMyFFuGaZrVYcJi/T1n\nmyuVSjE8PMzAwAC9vb10dHRw/Pjx0SVTJPT09KiHICTz9bU1U5HNVDgTZtYAPPbYY48V5cAPkbki\n6z+FRnffW+jHN7PPA1fz6mHCL7r7byZouyDe9x0dHaNnEeQuR48eZWBggP7+fvr7+0/4fmRkJOr4\nRaWurm50AOGaNWvGDCicbFDhfDfd9/2c6iEQkblNhwlPlBlDkDlk0NHRQXt7O6+88gptbW0MDQ2d\nsCQSCR0ykLxTQSAiEqHcMQSdnZ0cPXqUV155hVdeeYWRkRGSyeTokrk9F3p3ZW5RQSAiEqFUKkUi\nkSAej9PX10dXVxfHjx+nvb2d1tbWGf2sYJzm+LdztxW7TMGT+1XCo4JARGSOMjNKS0spKSmhpKRk\n9Pvxvua2K9YCIZlMnjBjY2bRmIlwqS
AQEZmjMgVBeXk5ZWVllJeXT/v7Yp3pMDNDY2YZGBggHo/j\n7ioIQqaCQERkjiopKRndyS9atGjKJRaLjX5fVlacH/+Dg4P09PSMLqWlpbg7iURC124IWXG+IkRE\nZEqZHoJYLEZFRQVVVVVUVlaOu+RuKy8vjzr+uPr7+zl+/DgVFRWjRUtmjIWESwWBiMgclekhyC4I\nqqurqampobq6esz3uetisVjU8cfV09NDRUXFaMGSSCQYGBgo2h6N+UTPsIjIHGVmJxQEixcvZvHi\nxdTW1o5+zSzZ6ysqKqKOP67Ozs7RCYSy52dQQRA+PcMiInPUeIcMampqqKurY8mSJScs2esrKyuj\njj+uY8eOjQ4gzBQD2T0GEh4VBCIic5SZjZ5KWFZWNjrAMBaLjQ4gzD2UkOklqKqqijr+uIaHh6mp\nqaGqqoqKigpisRhlZWVFe5rkfFKc552IiMi0uPsJSyqVGne9yGRUEIiIzAMqCmS2VBCIiMxR2dP6\njlcQ5BYF2fcRyaWCQERkDsvewU9UCKgYkOnQoMIF4MUXX+TFF1+kv7+fRCIRdZxZKS8vp6amhjPO\nOIPVq1dHHUekKOTu9CfrIRCZiAqCBeDhhx/m9ttv5+DBg3R1dUUdZ1aWLl3K2rVrueKKK/jABz4Q\ndRyRojHeYYOJxhKIjEcFwTyUSCQ4fPgwzz//PE8++SSPPPII+/bt4/jx43N+LvCOjg66urowMw4c\nOMBrXvMaNmzYwNlnn63zlGVBmu74ARUCMhUVBPNMIpGgu7ubffv2cd9997F9+3Z6enqijpU38Xh8\n9BDIrl27uPTSS7nssstYtmwZ9fX1RTsdq0gxUFEgk1FBMM8cPnyYxx9/nO3bt7N7924GBgaijhSa\nwcFBdu/ezcjICGbGm970JjZu3Bh1LBGROSnvBYGZXQO8D3gNEAd+DXzJ3Z/ParMIuAH4MLAI2Al8\n3t3b851noRgaGqK7u5snnniCnTt3smfPHg4dOhR1rFCNjIzw0ksvYWbEYjEqKytZsWIFixcvVk+B\niMgMhXHa4cXAt4CLgHcC5cDPzCx74uwbgXcB7wcuAU4DtoeQZcHo7e1l//793H///dx7770cOXIk\n6kgF09bWxq5du3jkkUc4cOAA/f39UUcSEZlz8t5D4O6XZ982s08A7UAj8JCZ1QKfAj7i7g+k23wS\n2G9mF7r7nnxnWgh6enr47W9/y4EDB+jo6GBkZCTqSAWTSCTo6enhhRde4KmnnmLVqlXU19dHHUtE\nZE4pxMRESwAHOtK3GwkKkV9kGrj7c8BhYFMB8sxLvb29PPvssxw5coTh4WFSqVTUkQomc2W0l19+\nmaeffpqOjg4SiYQGUImIzECoBYEFl6e6EXjI3Z9Jr14FDLt77tD3tvQ2OQmJRIKurq55PYhwKseO\nHeN3v/sdv/vd72htbV1QRZGIyGyF3UNwC/BaoGkabY2gJ0FmIJVK0dvby/Hjxzl27NiCLgh6eno4\ncuQIR48epaenRz0EIiIzENpph2b2beBy4GJ3zx7h1grEzKw2p5dgBUEvwYSam5upq6sbs66pqYmm\npunUG/NTKpWitbWVQ4cO0dbWtqAH1A0MDNDZ2Uk8HieZTC74gqClpYWWlpYx67q7uyNKIyLFLpSC\nIF0MvAd4m7sfztn8GDACbAZ+km6/ATgTeGSyn7tt2zYaGhryH3gOc3cGBgbo7e0lHo/P+WsVzEYq\nlWJ4eJi+vj4GBgYWfEEwXrG8d+9eGhsbI0okIsUsjHkIbiE4RLAV6DezlelN3e4+6O49ZvZvwA1m\n1gn0AjcBD+sMg5lzd4aHhxkcHJzxYEIzo7y8nOrqapYsWUIw5CN67k5PTw/9/f0MDQ1Ne8fu7iST\nSeLx+Jyfonm+MbOvAl/NWf2su782ijwicqIwegg+SzAW4Fc56z8J/CD9fTOQBO4imJhoB/CFELLI\nJM
rLyznjjDN461vfygc/+EEqKyunvlMBxONxfvrTn/Lwww/z7LPPauc+fzxF0DOYqTwXzrmxInNA\nGPMQTDlQ0d2HgC+mF4lISUkJNTU1rFmzhre97W3U1NREHQmAvr4+Dhw4wDPPPENpaWnUcSR/Rtz9\naNQhRGR8hZiHQEQE4Gwze9nMfm9mt5nZGVEHEpFXqSAQkULYDXwC2EJwWHEN8F9mVh1lKBF5la52\nKCKhc/edWTefMrM9wB+ADwHfiyaViGRTQSAiBefu3Wb2PLB+qraaf0Rk+mYz/4gKAhEpODOrAdbx\n6plHE9L8IyLTN5v5RzSGQERCZ2b/aGaXmNlZZvZmgknJRoCWKe4qIgWiHgIRKYTVwB3AMuAo8BDw\nJnc/HmkqERmlgkBEQufuOuAvUuR0yEBERERUEIiIiIgKAhEREUEFgYiIiKCCQERERFBBICIiIqgg\nEBEREVQQiIiICCoIREREBBUEIiIiggoCERERQQWBiIiIoIJAREREUEEgIiIiFKAgMLNrzCxlZjdk\nrVtkZjeb2TEz6zWzu8xsRdhZREREZHyhFgRm9kbgz4EncjbdCLwLeD9wCXAasD3MLCIiIjKx0AoC\nM6sBbgM+DXRlra8FPgU0u/sD7r4P+CTwFjO7MKw8IiIiMrEwewhuBu519/tz1l8AlAG/yKxw9+eA\nw8CmEPOIiIjIBMrC+KFm9hHgPIKdf66VwLC79+SsbwNWhZFHREREJpf3gsDMVhOMEfhTd0/M5K6A\n5zuPiIiITC2MHoJG4BTgMTOz9LpS4BIzuxK4FFhkZrU5vQQrCHoJJtTc3ExdXd2YdU1NTTQ1NeUt\nvMh80dLSQktLy5h13d3dEaURkWIXRkGwC/jjnHX/AewHvgG8DCSAzcBPAMxsA3Am8MhkP3jbtm00\nNDTkOa7I/DResbx3714aGxsjSiQixSzvBYG79wPPZK8zs37guLvvT9/+N+AGM+sEeoGbgIfdfU++\n84iIiMjUQhlUOI7csQHNQBK4C1gE7AC+UKAsIiIikqMgBYG7vyPn9hDwxfQiIiIiEdO1DEREREQF\ngYjMnpldbGb3mNnL6WuXbB2nzdfM7IiZDZjZz81sfRRZRWR8KghEJB+qgccJxgKdMJ+ImX0JuBL4\nDHAh0A/sNLNYIUOKyMQKNahQROYxd99BMDiYrPlHsl0FXOfu96bbfJxg3pH3AncWKqeITEw9BCIS\nKjNbQzAtefb1S3qAR9H1S0SKhgoCEQnbKoLDCLkzker6JSJFRAWBiERF1y8RKSIaQyAiYWsl2Pmv\nZGwvwQpg31R31jVMRKZvNtcwUUEgIqFy94Nm1kpw/ZInAcysFrgIuHmq++saJiLTN5trmKggEJFZ\nM7NqYD1BTwDAWjM7F+hw9xcJLon+FTM7ABwCrgNeAu6OIK6IjEMFgYjkwwXALwnGBDjwzfT67wOf\ncvfrzawKuBVYAjwIXObuw1GEFZETqSAQkVlz9weYYpCyu18LXFuIPCIyczrLQERERFQQiIiIiAoC\nERERQQWBiIiIoIJAREREUEEgIiIiqCAQERERVBCIiIgIKghEREQEFQQiIiJCSAWBmZ1mZj80s2Nm\nNmBmT5hZQ06br5nZkfT2n5vZ+jCyiIiIyNTyXhCY2RLgYWAI2AJsBP4C6Mxq8yXgSuAzwIVAP7DT\nzGL5ziMiIiJTC+PiRl8GDrv7p7PW/SGnzVXAde5+L4CZfRxoA94L3BlCJhEREZlEGIcM3g38xszu\nNLM2M9trZqPFgZmtAVYBv8isc/ce4FFgUwh5REREZAphFARrgc8BzwF/BvwLcJOZXZHevorgeult\nOfdrS28TERGRAgvjkEEJsMfd/yZ9+wkzex1BkXDbJPczgkJBRERECiyMguAVYH/Ouv3Af09/30qw\n81/J2F6CFcC+yX5wc3MzdXV1Y9Y1NTXR1NQ0m7wi81JLSwstLS1j
1nV3d0eURkSKXRgFwcPAOTnr\nziE9sNDdD5pZK7AZeBLAzGqBi4CbJ/vB27Zto6GhYbImIpI2XrG8d+9eGhsbI0okIsUsjIJgG/Cw\nmV1DcMbARcCngT/PanMj8BUzOwAcAq4DXgLuDiGPiIiITCHvBYG7/8bM3gd8A/gb4CBwlbv/KKvN\n9WZWBdwKLAEeBC5z9+F85xEREZGphdFDgLvfB9w3RZtrgWvDeHwRERGZGV3LQERERFQQiIiIiAoC\nERERQQWBiIiIoIJAREREUEEgIiIiqCAQkTwws4vN7B4ze9nMUma2NWf799Lrs5dJT00WkcJSQSAi\n+VANPA58gYkvUvZTgmuYrEovugiJSBEJZWIiEVlY3H0HsAPAzGyCZkPufrRwqURkJtRDICKF8nYz\nazOzZ83sFjNbGnUgEXmVeghEpBB+CmwnuLbJOuDvgfvMbJO7T3SIQUQKSAWBiITO3e/Muvm0mf0W\n+D3wduCXkYQSkTFUEIhIwbn7QTM7BqxnioKgubmZurq6MeuamppoatKYRJFcLS0ttLS0jFnX3d09\nrfuqIBCRgjOz1cAy4JWp2m7bto2GhobwQ4nMA+MVy3v37qWxsXHK+6ogEJFZM7Nqgv/2M2cYrDWz\nc4GO9PJVgjEErel2/wA8D+wsfFoRGY8KAhHJhwsIuv49vXwzvf77wOeBNwAfB5YARwgKgf/j7onC\nRxWR8aggEJFZc/cHmPw05ksLlUVETo7mIRAREREVBCIiIqKCQERERFBBICIiIqggEBEREUIoCMys\nxMyuM7MXzGzAzA6Y2VfGafc1MzuSbvNzM1uf7ywiIiIyPWH0EHwZ+AzBucevAa4GrjazKzMNzOxL\nwJXpdhcC/cBOM4uFkEdERESmEMY8BJuAu9PXRwc4bGYfJdjxZ1wFXOfu9wKY2ceBNuC9QPZFUERE\nRKQAwugh+DWw2czOBkhPX/oW4L707TXAKuAXmTu4ew/wKEExISIiIgUWRg/BN4Ba4FkzSxIUHX/t\n7j9Kb19FMLVpW8792tLbREREpMDCKAg+DHwU+AjwDHAe8E9mdsTdfzjJ/YygUBAREZECC6MguB74\nurv/OH37aTP7I+Aa4IcEVzszYCVjewlWAPsm+8G6LrrI9M3muugisvCEURBUceJ/+inS4xXc/aCZ\ntQKbgScBzKwWuAi4ebIfrOuii0zfbK6LLiILTxgFwb3AX5vZi8DTQAPQDHw3q82NwFfM7ABwCLgO\neAm4O4Q8IiIiMoUwCoIrCXbwNxMcBjgC/HN6HQDufr2ZVQG3Elwf/UHgMncfDiGPiIiITCHvBYG7\n9wP/K71M1u5a4Np8P76IiIjMnK5lICIiIioIRERERAWBiIiIoIJAREREUEEgIiIiqCAQERERVBCI\niIgIKghEREQEFQQiIiKCCgIRERFBBYGIiIiggkBE8sDMrjGzPWbWY2ZtZvYTM9uQ02aRmd1sZsfM\nrNfM7jKzFVFlFpGxVBCISD5cDHwLuAh4J1AO/MzMKrPa3Ai8C3g/cAlwGrC9wDlFZAJhXP5YRBYY\nd788+7aZfQJoBxqBh8ysFvgU8BF3fyDd5pPAfjO70N33FDiyiORQD4GIhGEJ4EBH+nYjwT8gv8g0\ncPfngMPApoKnE5ETqCAQkbwyMyM4PPCQuz+TXr0KGHb3npzmbeltIhIxHTIQkXy7BXgt8NZptDWC\nngQRiZgKAhHJGzP7NnA5cLG7H8na1ArEzKw2p5dgBUEvwYSam5upq6sbs66pqYmmpqY8pRaZP1pa\nWmhpaRmzrru7e1r3VUEgInmRLgbeA7zN3Q/nbH4MGAE2Az9Jt98AnAk8MtnP3bZtGw0NDfkPLDIP\njVcs7927l8bGxinvq4JARGbNzG4BmoCtQL+ZrUxv6nb3QXfvMbN/A24ws06gF7gJeFhnGIgUBxUE\nIpIPnyUYC/CrnPWfBH6Q/r4Z
SAJ3AYuAHcAXCpRPRKaggkBEZs3dpzxjyd2HgC+mFxEpMjM+7dDM\nLjaze8zsZTNLmdnWcdp8zcyOmNmAmf3czNbnbK83s9vNrNvMOs3su2ZWPZtfRERERE7eycxDUA08\nTtDVd8LpQmb2JeBK4DPAhUA/sNPMYlnN7gA2EgwwehfBNKa3nkQWERERyYMZHzJw9x0Ex/4yE5Dk\nugq4zt3vTbf5OMFpRe8F7jSzjcAWoNHd96XbfBH4TzP7S3dvPanfRERERE5aXmcqNLM1BLOOZU9P\n2gM8yqvTk74J6MwUA2m7CHobLspnHhEREZmefE9dvIpgx5470Uj29KSrCC56MsrdkwRznmsKUxER\nkQgU6loG05meVFOYioiIRCTfpx22EuzYVzK2l2AFsC+rzYrsO5lZKVCPpjAVyZvZTGEqIgtPXgsC\ndz9oZq0EZw88CZC+DvpFwM3pZo8AS8zs/KxxBJsJColHJ/v5msJUZPpmM4WpiCw8My4I0vMFrCfY\ngQOsNbNzgQ53f5HgsqdfMbMDwCHgOuAl4G4Ad3/WzHYC3zGzzwEx4FtAi84wEBERicbJ9BBcAPyS\n4Hi/A99Mr/8+8Cl3v97MqgjmFVgCPAhc5u7DWT/jo8C3Cc4uSBFMZXrVSf0GIiIiMmsnMw/BA0wx\nGNHdrwWunWR7F3DFTB9bREREwlGoswxERESkiOniRgtYKpWir6+PQ4cO8eCDD1JZWRl1JADi8TgH\nDhygq6uLZDIZdRwRkQVBBcEClkgkePHFF7nnnnt48MEHGX8m6sJzd7q7u+nv72doaCjqOCIiC4IK\ngjnOzIjFYlRUVBCLxSgpmf5RIHdneHiYjo4OOjo6QkxZGGZGaWkplZWVVFRURB1HRGRO0RiCOc7M\nqKqqYvHixVRWVlJeXh51pMiUlJRQXl5OTU0NVVVVRdPjISIyF6ggmONKSkpYuXIlZ511FitXrqS6\nujrqSJGpqqqivr6eyspKSktLVRCIiMyACoI5rqSkhNraWpYvX87y5cupqqqKOlJkamtrOe200zjl\nlFNYvHixCgIRkRlQQTBPlJeXs2TJkqI5UyAKy5Yt4+yzz2b9+vWsWrVqRuMpREQWOn1izhM1NTWc\nc845nH766TMeXDjXZQYTrl69mte97nUsW7aMWCymHgIRkRlYOHuNea6uro43vOENrFu3jvr6+gU1\nuLCsrIza2lrWrFnD61//ehYvXhx1JBGROUenHc4TixcvZuPGjbzjHe9geHiY+++/nxdeeCHqWAWx\natUqNm3axKZNm1i/fj01NTVRRxIRmXNUEMwTixYtYsWKFZx33nm4O729vSSTSV5++WVGRkaijheK\nsrIyVq5cSUNDA1u2bOHcc89l+fLlUccSEZmTVBDMM2eccQa1tbW4O1VVVWzfvp2enp6oY4WioqKC\nTZs2cemll3L55ZdTX18fdSQRkTlLBcE8E4vFWLJkCQ0NDVRXV3P22Weze/duHnvsMY4fP87g4GDU\nEWelsrKSZcuW0djYyIUXXsjGjRvZsGEDS5cuJRaLRR1PRGTOUkEwD5WXl7Nu3TrWrVvH5Zdfzo9+\n9CNGRkY4ePAgXV1dUceblaVLl7Ju3To+9rGP8cEPfjDqOCKRM7Nxl5KSknHXi0xEBcEC8OY3v5nV\nq1fT399PIpGIOs6sxGIxqqurOeuss6KOIlJUMjv73GJgvMJAZDwqCBaAM888kzPPPDPqGDKPmdk1\nwPuA1wBx4NfAl9z9+aw2vwIuybqbA7e6++cLGHVemqiHILcYEJmM5iEQkXy4GPgWcBHwTqAc+JmZ\nZU+d6cC/AiuBVcCpwNUFzjnvZO/oJyoEsnsPRCaiHgIRmTV3vzz7tpl9AmgHGoGHsjYNuPvRAkab\n17J39NMZQ5B9H5Fc6iEQkTAsIegR6MhZ/zEzO2pmvzWzr+f0IMgsTOeQgYoBmYx6CEQkryzY69
wI\nPOTuz2Rtuh34A3AEeANwPbAB+EDBQ84j0+kdyLQTmYwKAhHJt1uA1wJvyV7p7t/Nuvm0mbUCu8xs\njbsfLGTA+cLdSaVSpFIpkskkyWSSkZEREokEiUSC4eFhhoaGGBwcZGBggFgsRiwWo7y8nGQyGXX8\ncfX19TEwMEA8HmdoaIhEIkEymcTdo4427824IDCzi4H/TXBs8FTgve5+T3pbGfB3wGXAWqAb2AV8\n2d1fyfoZ9cC3gf8GpIDtwFXu3j+r30ZEImVm3wYuBy7Ofs9P4FHAgPXAhAVBc3MzdXV1Y9Y1NTXR\n1NQ0y7Rzn7uTTCYZHh4e3emXlpZiZrg7IyMjY7b19fXR3d1NZ2cnFRUVUccfV2dnJy+//DLt7e10\ndHTQ29tLPB6ft1Ow51tLSwstLS1j1nV3d0/rvifTQ1ANPA78O8GOPFsVcB7wt8CTQD1wE3A3cGFW\nuzsIRhpvBmLAfwC3AlecRB4RKQLpYuA9wNvc/fA07nI+wTiDSQuHbdu20dDQkIeE80/uTj+7GMgU\nCkNDQ8Tj8dFioKamhpqamqKd2bOnp4fW1lba29vp7OwcLQjm+hwqhTJesbx3714aGxunvO+MCwJ3\n3wHsgNFjhdnbeoAt2evM7ErgUTNb7e4vmdnGdJtGd9+XbvNF4D/N7C/dvXWmmUQkWmZ2C9AEbAX6\nzWxlelO3uw+a2Vrgo8B9wHHgXOAG4AF3fyqKzPNB5lBBIpFgcHBwTDGQWZfpGaisrKSyspKqqioq\nKyuL9hLpfX19dHR0jC49PT0MDg6qh6AACjGGIDPaODNn7puAzkwxkLYr3eYigt4EEZlbPkvwHv5V\nzvpPAj8AhgnmJ7iKoJfxReDHBIcY5SRl9xBMVAwsWrRo3KWsrDiHkMXjcXp6ekYXHTIonFBfEWa2\nCPgGcIe796VXryI4P3mUuyfNrCO9TUTmGHef9BRmd38JeHth0iwcqVSKkZGREw4TDA4OUl5ePrqU\nlZWNuV1eXk5JSXGedT48PEw8Hh8dWJj5qoIgfKEVBOkBhj8m+K9hOlOTWrqtiIhMQ6YIyO4ZKCkp\nobS0dPTrRN8X62mImd8jc5ZE5nuNIQhfKAVBVjFwBvCOrN4BgFZgRU77UoIBiG2T/VyNNhaZvtmM\nNpa5IXPIwMxGd5i5UxlPtK6YufvoaYaZ73XaYfjyXhBkFQNrgT9x986cJo8AS8zs/KxxBJsJegge\nnexna7SxyPTNZrSxFE5JSQnl5eVUVFRQXV1NXV0dS5cupa+vb/Qc/ImW7J1mhnaccrJOZh6CaoLz\nhjNl5lqykImeAAAKXklEQVQzO5dgitIjBKcinkcwx0B51mjjDndPuPuzZrYT+I6ZfY7gtMNvAS06\nw0BEFprS0lIWLVpEVVUVtbW11NfXMzAwQCKRwMxGTx3MLJnbqVRKO3/Jq5PpIbgA+CXB8X4Hvple\n/32C+QfenV7/eHp9ZmzAnwD/lV73UYKJiXYRTEx0F8HoYxGRBaW0tJRYLEZ1dTW1tbWj59y7O2Vl\nZfT39zMwMDD6dWBggFQqRSKRIJVKRR1f5pGTmYfgASa/KNKUQ1fdvQtNQiQiQklJCbFYjMrKSmpr\naxkeHiaZTGJmlJeXj55+lzkzIJVKjZ5mKJJPxXkiqojIApE5ZFBdXT36X392kdDR0UEsFqOkpAR3\nJ5FIEI/HVRBI3hXniagT2LFjR9QRThi1rQzKELViyFAIxfp7zjZX5pBB9hiCU045hVNPPZXTTz+d\nVatWsXz5cpYsWUJNTc3oLIPFOo/AXDRfX1szNadeUTt37ow6QlG8cJRBGYotQyEU6++Zj4IgM6hw\n8eLFLF26dExBsHLlSpYvX059fT2LFy8eLQjUQ5A/8/W1NVM6ZCAiEqGSkhLKysqoqKgYPQVx0aJF\nDA8PU1lZydDQEAMDA/T09FBZWUksFqOsrEwFgeTdnOohEB
ERkXCoIBAREZE5c8igAqC3t5e9e/dG\nGqS7u1sZlGHOZti/f3/m24rQAuVPBbyauRie6/HMNtfIyAiJRGL0a/YyNDREe3s7bW1ttLW1cfz4\ncXp7exkcHNQcBONIJpMMDAzQ2dlJVVUVZWVlJJNJ4vE4XV1dlJaWjnu/+fraypju+97mwkxXZvZR\n4Paoc4jMIx9z9zuiDjEZve9F8m7S9/1cKQiWAVuAQ8BgtGlE5rQK4I+Ane5+POIsk9L7XiRvpvW+\nnxMFgYiIiIRLgwpFREREBYGIiIioIBARERFUEIiIiAhzpCAwsy+Y2UEzi5vZbjN7Y4iPdY2Z7TGz\nHjNrM7OfmNmGnDaLzOxmMztmZr1mdpeZrQg5U8rMbihkBjM7zcx+mH6MATN7wswactp8zcyOpLf/\n3MzW5/HxS8zsOjN7If3zD5jZV8Zpl7cMZnaxmd1jZi+nn/OtM308M6s3s9vNrNvMOs3su2ZWnY8M\nZlZmZv9gZk+aWV+6zffN7NR8Zig2hfwMmGaer6b/NtnLMxHkmPXrtdCZzOx74zx394WZKf24xfrZ\nPlWmX+U8V0kzuyWMPEVfEJjZh4FvAl8FzgeeAHaa2fKQHvJi4FvARcA7gXLgZ2ZWmdXmRuBdwPuB\nS4DTgO1hhEl/8P05we+dLdQMZrYEeBgYIjj1ayPwF0BnVpsvAVcCnwEuBPoJ/jaxPMX4cvpnfx54\nDXA1cLWZXRlihmrgceALwAmn4Ezz8e4geL42E/yNLgFuzVOGKuA84G8J3g/vA84B7s5pN9sMRSOC\nz4DpegpYCaxKL2+NIEM+Xq8FzZT2U8Y+d00h5skoqs/2GWRy4F959fk6leCzMP/cvagXYDfwT1m3\nDXgJuLpAj78cSAFvTd+uJdhJvi+rzTnpNhfm+bFrgOeAdwC/BG4oVAbgG8ADU7Q5AjRn3a4F4sCH\n8pThXuA7OevuAn5QiAzp53PrTH5ngp1wCjg/q80WYARYlY8M47S5AEgCq8PIEPUS9WfABJm+CuyN\n+rmZ6rUS9nv0JDN9D/i/RfB8RfbZPt1M6XWjn/1hL0XdQ2Bm5UAj8IvMOg+eoV3ApgLFWEJQoXWk\nbzcSTPmcnek54HAImW4G7nX3+3PWX1CADO8GfmNmd6a7svaa2aczG81sDUG1mp2hB3g0jxl+DWw2\ns7PTj3ku8BbgvgJmGDXNx3sT0Onu+7LuuovgNXRRvjOlZV6jXRFmCEWRfAZM5Ox0t/jvzew2Mzsj\n4jxjFPr9MUNvT3+uPGtmt5jZ0ggyRPnZPt1MGR8zs6Nm9lsz+3pOD0LeFPu1DJYDpUBbzvo2gsot\nVGZmBF1ID7l75vjgKmA4/cbKzbQqj4/9EYKu4QvG2byyABnWAp8j6Kr9O4IdyU1mNujut6Ufxxn/\nb5OvDN8gqNqfNbMkwSGuv3b3H6W3FyJDtuk83iqgPXujuyfNrCOMTGa2iOB5usPd+6LIELJIPwMm\nsRv4BEEP3qnAtcB/mdnr3b0/wlzZCv3+mK6fEnTDHwTWAX8P3Gdmm9LFXuii/GyfYSYIpu/+A0Fv\nzxuA64ENwAfynaHYC4KJGBMfm8qnW4DXMr1jg3nLZGarCV4Yf+ruiZncNV8ZCHa+e9z9b9K3nzCz\n1xEUCbcVKMOHgY8CHwGeISiQ/snMjrj7DwuUYTqm83h5z2RmZcCP0z/389O5S74zRCjS38Xdd2bd\nfMrM9hB8aH+IoEu8mEX93N2ZdfNpM/st8Hvg7QTd44UQyWf7FDKZ3pK90t2/m3XzaTNrBXaZ2Rp3\nP5jPAEV9yAA4RnBsdGXO+hWcWPXmlZl9G7gceLu7H8na1ArEzKw2xEyNwCnAY2aWMLME8DbgKjMb\nTj/OopAzvALsz1m3Hzgz/X0rwRslzL/N9cDfu/uP3f1pd78d2AZcU8AM2abzeK3p26PMrBSoz2em\nrGLgDODPsnoHCpahQC
L7DJgJd+8GngdCHcE/Q4V+f5yU9E7tGAV67iL+bJ9OplemaP4owd81789X\nURcE6f+OHyMYKQ2MdqtsJji+HIr0H+c9wJ+4++GczY8RDM7KzrSBYEf5SJ4i7AL+mOA/4nPTy28I\n/jPPfJ8IOcPDnNglew7Bf0GZN3FrToZagkML+frbVHFiZZ4i/botUIZR03y8R4AlZnZ+1l03E7yB\nH81HjqxiYC2w2d07c5qEnqFQovoMmCkzqyHo/p7qw7xgCv3+OFnpHtFlFOC5K4LP9plmGs/5BJ+L\n+X++CjFycZajLj9EMCr24wSnnt0KHAdOCenxbiE4te5igso6s1TktDlI0MXVSLDzfDDk52HMSNOw\nMxCMXRgi+G98HUHXfS/wkaw2V6f/Fu8mKGD+H/A7IJanDN8jGNBzOXAWwSl27cDXw8pAcMrUuQTF\nWAr4n+nbZ0z38QgGPf4GeCNB999zwA/zkYHgePrdBIXZH+e8RsvzlaGYlkJ/Bkwz0z8SnJZ2FvBm\n4OcE/0UuK3COWb9eC5kpve16gqLkLIKd728Ieh/Lw8qUzlV0n+1TZSIo+r8CNKSfr63AAeD+UPIU\n8sU7iyft8wSXQI0TVGoXhPhYKYIuytzl41ltFhGcO3qMYCf5Y2BFyM/B/YwtCELPQLAjfhIYAJ4G\nPjVOm2sJBrsMADuB9Xl8/GrghvQbtD/9Qfa3QFlYGQgOzYz3Gvj36T4ewUjh24Du9Jv9O0BVPjKk\nPxRyt2VuX5KvDMW2FPIzYJp5WghOfYwTFK13AGsiyDHr12shMxFchncHQc/FIPAC8M8UoLibIFOk\nn+1TZQJWA78Cjqb/fs8RDMKsCSOPLn8sIiIixT2GQERERApDBYGIiIioIBAREREVBCIiIoIKAhER\nEUEFgYiIiKCCQERERFBBICIiIqggEBEREVQQiIiICCoIREREBBUEIiIiAvx/SawLn9+R+wUAAAAA\nSUVORK5CYII=\n",
      "text/plain": [
       "<matplotlib.figure.Figure at 0x112b51c10>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "\n",
    "%matplotlib inline\n",
    "\n",
    "# Show one source image next to its converted version as a visual sanity check.\n",
    "imgpath = 'English/Fnt/Sample001/img001-00001.png'\n",
    "# Load as grayscale: cv2.imread defaults to a 3-channel BGR array, and imshow\n",
    "# ignores cmap for 3-channel input.\n",
    "img = cv2.imread(imgpath, cv2.IMREAD_GRAYSCALE)\n",
    "rsz = convert(imgpath)\n",
    "\n",
    "fig, (ax_raw, ax_out) = plt.subplots(1, 2)\n",
    "ax_raw.imshow(img, cmap='gray')\n",
    "ax_raw.set_title('original')\n",
    "ax_out.imshow(rsz, cmap='gray')\n",
    "ax_out.set_title('converted')\n",
    "plt.show()  # ends the cell without echoing an AxesImage repr"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 预处理所有图片"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "train/0/: 100%|██████████| 1016/1016 [00:00<00:00, 1150.59it/s]\n",
      "train/1/: 100%|██████████| 1016/1016 [00:00<00:00, 1232.67it/s]\n",
      "train/2/: 100%|██████████| 1016/1016 [00:00<00:00, 1201.22it/s]\n",
      "train/3/: 100%|██████████| 1016/1016 [00:00<00:00, 1186.00it/s]\n",
      "train/4/: 100%|██████████| 1016/1016 [00:00<00:00, 1176.58it/s]\n",
      "train/5/: 100%|██████████| 1016/1016 [00:01<00:00, 936.41it/s]\n",
      "train/6/: 100%|██████████| 1016/1016 [00:01<00:00, 897.20it/s]\n",
      "train/7/: 100%|██████████| 1016/1016 [00:01<00:00, 997.21it/s]\n",
      "train/8/: 100%|██████████| 1016/1016 [00:01<00:00, 973.86it/s]\n",
      "train/9/: 100%|██████████| 1016/1016 [00:01<00:00, 961.00it/s]\n",
      "train/10/: 100%|██████████| 52832/52832 [00:52<00:00, 999.49it/s] \n"
     ]
    }
   ],
   "source": [
    "# Rebuild train/<label>/ from scratch: label i is read from Sample(i+1).\n",
    "rmdir('train')\n",
    "\n",
    "failed = []  # (path, error) for every image that could not be converted\n",
    "for i in range(11):\n",
    "    path = 'English/Fnt/Sample%03d/' % (i+1)\n",
    "    trainpath = 'train/%d/' % i\n",
    "    mkdir(trainpath)\n",
    "    # Named imgname, not filename: `filename` is the archive name used by the\n",
    "    # download and extract cells and must not be overwritten here.\n",
    "    for imgname in tqdm(os.listdir(path), desc=trainpath):\n",
    "        try:\n",
    "            cv2.imwrite(trainpath + imgname, convert(path + imgname))\n",
    "        except Exception as err:\n",
    "            # Still best-effort, but a bare except would also swallow Ctrl-C\n",
    "            # and hide the case where every single image fails.\n",
    "            failed.append((path + imgname, err))\n",
    "\n",
    "print('%d images could not be converted (see `failed`)' % len(failed))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 分离出验证数据集\n",
    "\n",
    "http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from sklearn.model_selection import train_test_split\n",
    "SPLIT_SEED = 0  # fixed so the train/valid split is the same on every run\n",
    "for i in range(11):\n",
    "    trainpath = 'train/%d/' % i\n",
    "    validpath = 'valid/%d/' % i\n",
    "    mkdir(validpath)\n",
    "    # Undo any previous split first, so re-running this cell re-splits the\n",
    "    # whole class instead of moving another 10% on top of the old hold-out.\n",
    "    for imgname in os.listdir(validpath):\n",
    "        if os.path.exists(trainpath + imgname):\n",
    "            os.remove(validpath + imgname)  # stale copy: train/ was regenerated since\n",
    "        else:\n",
    "            os.rename(validpath + imgname, trainpath + imgname)\n",
    "    # os.listdir order is arbitrary; sort so the seed alone determines the split.\n",
    "    imgs = sorted(os.listdir(trainpath))\n",
    "    validimgs = train_test_split(imgs, test_size=0.1, random_state=SPLIT_SEED)[1]\n",
    "    for imgname in validimgs:\n",
    "        os.rename(trainpath + imgname, validpath + imgname)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
